start_time <- Sys.time()
# Load helper packages
library(data.table)
library(plotly)
library(tidymodels)
library(hydrorecipes)
# new names for predictors
nms_other <- c('datetime',
'precipitation',
'temperature_mean',
'temperature_min',
'temperature_max',
'sea_pressure',
'humidity',
'wind',
'insolation',
'evapotranspiration')Prediction Challenge - Germany
Lagged linear regression model
This model can be interpreted and refined based on the responses of each regressor group. It will not be the most accurate predictor, but it is fast and can help identify the most important components. This method is similar to what is found in (Kennel 2020).
Set-up
Prepare data
outcome <- fread('../../data/Germany/heads.csv')
predictors <- fread('../../data/Germany/input_data.csv')
# make names more verbose
setnames(outcome, c('datetime', 'wl'))
setnames(predictors, nms_other)
# join data and make a numeric time column
dat <- outcome[predictors, on = 'datetime']
# join data and make a numeric time column
dat <- outcome[predictors, on = 'datetime']
# ad hoc estimate of water deficit. Use distributed lag model for predictors
dat[, deficit := cumsum(scale(precipitation)) - cumsum(scale(evapotranspiration))]
dat[, deficit := lm(deficit~splines::ns(datetime, df = 6))$residuals]
varknots <- c(-40, 75)
nms <- paste0('deficit_', 1:(length(varknots) + 3))
dat[, c(nms) := as.data.table(splines::bs(deficit, knots = varknots))]
# ad hoc snow melt
dat[, min_temp_diff := c(0, diff(temperature_min, lag = 1))]
dat[, snow_melt := 0]
dat[min_temp_diff >= 7.1 & month(datetime) %in% c(1:3), snow_melt := 1]
dat[, snow_melt := snow_melt * precipitation]
# separate precipitation into snow and rain
dat[, precipitation_snow := 0]
dat[, precipitation_rain := 0]
dat[temperature_max <= 0, precipitation_snow := precipitation]
dat[temperature_max > 0, precipitation_rain := precipitation]
# scaled precipitation based on ET
dat[, precip_evapo := precipitation_rain * 1.0 / evapotranspiration]
# temperature changes
dat[, temperature_diff := temperature_max - temperature_mean]
# clean-up
dat[, deficit := NULL]
dat[, precipitation := NULL]
# create feature dataset
all <- recipe(wl~., dat) |>
step_distributed_lag(precipitation_rain,
knots = log_lags(45, 270)) |>
step_distributed_lag(precipitation_snow,
knots = log_lags(15, 90)) |>
step_distributed_lag(precip_evapo,
knots = log_lags(30, 180)) |>
step_distributed_lag(snow_melt,
knots = log_lags(30, 180)) |>
step_distributed_lag(starts_with("deficit"),
knots = log_lags(51, 365 * 3.6)) |>
step_ns(temperature_diff, deg_free = 12)|>
step_rm(datetime) |>
step_corr(all_predictors()) |>
prep() |>
bake(new_data = NULL)
setDT(all)Fit model
fit <- lm(wl~., all)Make predictions
dat <- cbind(dat, predict(fit, all, interval = "prediction"))Plot results
A median filter to smooth the results as they appeared to have too much variance.
dat[, fit := runmed(fit, 5)]
p1 <- plot_ly(dat[datetime > as.POSIXct('2002-04-30', tz = 'UTC')],
x = ~datetime,
y = ~wl,
type = "scatter",
mode = "lines",
name = "Water Level",
line = list(color = "#808080")) |>
add_lines(x = ~datetime, y = ~fit, name = "Predictions" ,
line = list(color = "#6000FF60"))
p2 <- plot_ly(dat[year(datetime) > 2001], x = ~datetime,
y = ~wl - fit,
type = "scatter",
mode = "lines",
name = "Residuals",
line = list(color = "#808080"))
subplot(p1, p2, shareX = TRUE, nrows = 2)sum((dat$wl-dat$fit)^2, na.rm = TRUE)[1] 47.53229
Output submission
submission_times <- fread("submission_form_Germany.csv")
submission <- dat[datetime %in% submission_times$Date]
submission <- submission[, list(Date = datetime,
`Simulated Head` = fit,
`95% Lower Bound` = lwr,
`95% Upper Bound` = upr)]
fwrite(submission, "submission_form_Germany.csv")
end_time <- Sys.time()Timings
Total elapsed time is 4.4 seconds.
References
Kennel, Jonathan. 2020. “High Frequency Water Level Responses to Natural Signals.” PhD thesis, 50 Stone Rd E, Guelph, ON N1G 2W1, Canada: University of Guelph. http://hdl.handle.net/10214/17890.
Computer and software specs
Macbook Air M1 2020
16 GB Ram
sessionInfo()R version 4.2.1 (2022-06-23)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Ventura 13.1
Matrix products: default
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] hydrorecipes_0.0.5 yardstick_1.1.0 workflowsets_1.0.0
[4] workflows_1.1.2 tune_1.0.1 tidyr_1.2.1
[7] tibble_3.1.8 rsample_1.1.1 recipes_1.0.3
[10] purrr_1.0.0 parsnip_1.0.3 modeldata_1.0.1
[13] infer_1.0.4 dplyr_1.0.10 dials_1.1.0
[16] scales_1.2.1 broom_1.0.2 tidymodels_1.0.0.9000
[19] plotly_4.10.1 ggplot2_3.4.0 data.table_1.14.6
loaded via a namespace (and not attached):
[1] lubridate_1.9.0 DiceDesign_1.9 httr_1.4.4
[4] tools_4.2.1 backports_1.4.1 utf8_1.2.2
[7] R6_2.5.1 rpart_4.1.19 DBI_1.1.3
[10] lazyeval_0.2.2 colorspace_2.0-3 nnet_7.3-18
[13] withr_2.5.0 tidyselect_1.2.0 compiler_4.2.1
[16] cli_3.5.0 stringr_1.5.0 digest_0.6.31
[19] rmarkdown_2.19 pkgconfig_2.0.3 htmltools_0.5.4
[22] parallelly_1.33.0 lhs_1.1.6 fastmap_1.1.0
[25] collapse_1.8.9 htmlwidgets_1.6.0 rlang_1.0.6
[28] rstudioapi_0.14 generics_0.1.3 jsonlite_1.8.4
[31] crosstalk_1.2.0 magrittr_2.0.3 Matrix_1.5-3
[34] Rcpp_1.0.9 munsell_0.5.0 fansi_1.0.3
[37] GPfit_1.0-8 lifecycle_1.0.3 furrr_0.3.1
[40] stringi_1.7.8 yaml_2.3.6 MASS_7.3-58.1
[43] grid_4.2.1 parallel_4.2.1 listenv_0.9.0
[46] lattice_0.20-45 earthtide_0.0.14 splines_4.2.1
[49] knitr_1.41 pillar_1.8.1 fftw_1.0-7
[52] future.apply_1.10.0 codetools_0.2-18 glue_1.6.2
[55] evaluate_0.19 RcppParallel_5.1.5 vctrs_0.5.1
[58] foreach_1.5.2 gtable_0.3.1 future_1.30.0
[61] assertthat_0.2.1 xfun_0.36 gower_1.0.1
[64] prodlim_2019.11.13 class_7.3-20 survival_3.4-0
[67] viridisLite_0.4.1 timeDate_4021.107 iterators_1.0.14
[70] hardhat_1.2.0 lava_1.7.0 timechange_0.1.1
[73] globals_0.16.2 ellipsis_0.3.2 ipred_0.9-13